Dataset Description:

# NOTE(review): a negative `warn` silences every warning for the whole session,
# including the "NAs introduced by coercion" warnings raised by the as.numeric()
# conversions below and any deprecation warnings from the tidyverse calls.
options(warn = -1)
# Sets the LANGUAGE environment variable (presumably so messages are in English).
Sys.setenv(LANGUAGE = "en")
library(tidyverse)
library(plyr) #count()
library(GGally) #ggcorr() and ggpairs()
library(reshape) #melt()
library(corrplot) #corrplot
library(dplyr)
library(vcd)
#second library import
library(randomForest)
library(class)
library(caret)
library(ranger)
library(rsample)
library(e1071)
library(cluster)
library(factoextra)
#import
# The file is tab-delimited despite its .csv extension. With read.table()'s
# default whitespace separator, multi-word values are split into extra fields
# and empty cells collapse into their neighbours, so columns shift; fill = TRUE
# then hid the damage (surplus rows, numeric columns parsed as chr, and many
# rows later discarded by na.omit()). Reading with an explicit tab separator and
# without fill = TRUE keeps every record aligned and fails loudly on a malformed
# line.
# NOTE(review): the printed outputs further down were produced with the old
# parse; row counts and factor levels will change after re-knitting.
marketing <- read.table(file = "marketing_campaign.csv", sep = "\t", header = TRUE)
head(marketing)
##     ID Year_Birth  Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524       1957 Graduation         Single  58138       0        0  04-09-2012
## 2 2174       1954 Graduation         Single  46344       1        1  08-03-2014
## 3 4141       1965 Graduation       Together  71613       0        0  21-08-2013
## 4 6182       1984 Graduation       Together  26646       1        0  10-02-2014
## 5 5324       1981        PhD        Married  58293       1        0  19-01-2014
## 6 7446       1967     Master       Together  62513       0        1  09-09-2013
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1            0            0        0             3        11        1
## 2            0            0        0             3        11        0
## 3            0            0        0             3        11        0
## 4            0            0        0             3        11        0
## 5            0            0        0             3        11        0
## 6            0            0        0             3        11        0
# Inspect the column types produced by the import
str(marketing)
## 'data.frame':    2440 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : chr  "58138" "46344" "71613" "26646" ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : chr  "0" "1" "0" "0" ...
##  $ Dt_Customer        : chr  "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
##  $ Recency            : chr  "58" "38" "26" "26" ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...
# Number of rows in the imported data frame
nrow(marketing)
## [1] 2440
# NOTE(review): repeats the earlier str() call; nothing has modified
# `marketing` in between, so the output is identical.
str(marketing)
## 'data.frame':    2440 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : chr  "58138" "46344" "71613" "26646" ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : chr  "0" "1" "0" "0" ...
##  $ Dt_Customer        : chr  "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
##  $ Recency            : chr  "58" "38" "26" "26" ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...

Convert the columns to numeric/Date types so that they can be used for modeling

# Every column except ID (1), Education (3), Marital_Status (4) and
# Dt_Customer (8) is coerced to numeric; values that cannot be parsed become NA.
col.names <- names(marketing)
num.cols <- col.names[-c(1, 3, 4, 8)]
marketing[num.cols] <- lapply(marketing[num.cols], as.numeric)

# Parse the day-month-year strings into a proper Date column
marketing$Dt_Customer <- as.Date(marketing$Dt_Customer, format = "%d-%m-%Y")
head(marketing)
##     ID Year_Birth  Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524       1957 Graduation         Single  58138       0        0  2012-09-04
## 2 2174       1954 Graduation         Single  46344       1        1  2014-03-08
## 3 4141       1965 Graduation       Together  71613       0        0  2013-08-21
## 4 6182       1984 Graduation       Together  26646       1        0  2014-02-10
## 5 5324       1981        PhD        Married  58293       1        0  2014-01-19
## 6 7446       1967     Master       Together  62513       0        1  2013-09-09
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1            0            0        0             3        11        1
## 2            0            0        0             3        11        0
## 3            0            0        0             3        11        0
## 4            0            0        0             3        11        0
## 5            0            0        0             3        11        0
## 6            0            0        0             3        11        0

Drop rows with missing values

# Keep only rows with no NA in any column, then report how many remain
marketing <- na.omit(marketing)
nrow(marketing)
## [1] 2016
# Confirm the column types after conversion and NA removal
str(marketing)
## 'data.frame':    2016 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : num  1957 1954 1965 1984 1981 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : num  58138 46344 71613 26646 58293 ...
##  $ Kidhome            : num  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : num  0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : Date, format: "2012-09-04" "2014-03-08" ...
##  $ Recency            : num  58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : num  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : num  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : num  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : num  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : num  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : num  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : num  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : num  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: num  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : num  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : num  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : num  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : num  1 0 0 0 0 0 0 0 1 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:424] 11 20 21 29 39 40 46 49 50 52 ...
##   ..- attr(*, "names")= chr [1:424] "11" "20" "21" "29" ...
# Frequency table of Marital_Status levels.
# NOTE(review): called on a bare vector and printed with `x`/`freq` columns,
# so this appears to rely on plyr::count() rather than dplyr::count().
count(marketing$Marital_Status)
##          x freq
## 1   Absurd    2
## 2    Alone    3
## 3 Divorced  209
## 4  Married  777
## 5   Single  435
## 6 Together  517
## 7    Widow   71
## 8     YOLO    2

feature engineering - Marital_Status >> Rel_Status

Marital_Status currently has 8 different levels, some far more populated than others and several overlapping in meaning. While some of the levels can be combined (‘Alone’ and ‘Single’ most likely describe the same situation), others cannot: we don’t know what an ‘Absurd’ or ‘YOLO’ marital status is, so those values have to be handled differently. Thankfully, the unexpected values are few and far between. Most customers can be categorized as Coupled (either Married or Together), and a similarly large proportion are Single (whether that be Alone, Single, Divorced, or Widow).

#Create New Cohesive Categories
# Map each Marital_Status level to its group; 'Absurd' and 'YOLO' map to an
# empty string, and any level not listed here becomes NA.
marketing$Rel_Status <- unname(c(
  Alone = 'Single', Divorced = 'Single', Widow = 'Single', Single = 'Single',
  Married = 'Coupled', Together = 'Coupled',
  Absurd = '', YOLO = ''
)[as.character(marketing$Marital_Status)])
# Keep only rows with a non-empty, non-missing Rel_Status
marketing <- marketing[which(marketing$Rel_Status != ''), ]
nrow(marketing)
## [1] 2012
# Frequency of the two consolidated relationship groups
count(marketing$Rel_Status)
##         x freq
## 1 Coupled 1294
## 2  Single  718
# Frequency of each Education level
count(marketing$Education)
##            x freq
## 1      Basic   54
## 2 Graduation 1115
## 3     Master  364
## 4        PhD  479

Outlier Detection - Income

# Box plot of Income to expose extreme values
marketing %>%
    ggplot(aes(Income)) +
    geom_boxplot()

Drop the outlier

# Income values flagged as outliers by the boxplot rule
outliers <- boxplot(marketing$Income, plot = FALSE)$out
# Only the most extreme income is removed: rows whose Income is at or above
# (largest flagged value - 1) are dropped. The guard matters because with no
# flagged values max() returns -Inf, and the filter would discard every row.
if (length(outliers) > 0) {
    marketing <- marketing %>%
        filter(Income < max(outliers) - 1)
}

Closer look at Z_CostContact,Z_Revenue

# Look at the undocumented Z_* variables, starting with the five-number
# summary (plus mean) of Z_CostContact
summary(marketing$Z_CostContact)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       3       3       3       3       3       3
# Box plot of Z_CostContact
marketing %>%
    ggplot(aes(Z_CostContact)) +
    geom_boxplot()

  • Z_CostContact is constant, so it cannot help predict whether the customer accepted the offer in the last campaign >> drop
# Five-number summary (plus mean) of Z_Revenue
summary(marketing$Z_Revenue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      11      11      11      11      11      11
# Box plot of Z_Revenue
marketing %>%
    ggplot(aes(Z_Revenue)) +
    geom_boxplot()

  • Z_Revenue is constant, so it cannot help predict whether the customer accepted the offer in the last campaign >> drop
# Density of the enrollment date; the dashed red line marks its mean
marketing %>%
    ggplot(aes(x = Dt_Customer)) +
    geom_density(fill = "lightblue", color = "darkblue") +
    geom_vline(aes(xintercept = mean(Dt_Customer)),
               linetype = 'dashed', color = 'red', linewidth = 1)

# Density of the year of birth; the dashed red line marks its mean
marketing %>%
    ggplot(aes(x = Year_Birth)) +
    geom_density(fill = "lightblue", color = "darkblue") +
    geom_vline(aes(xintercept = mean(Year_Birth)),
               linetype = 'dashed', color = 'red', linewidth = 1)

  • A look at variables that refer to a date will give us more context to all of the customers we have.

  • The red dashed line in each plot marks the mean across all customers: the average customer joined around July 2013 and was born around 1970.

  • There is little variation in when customers enrolled with our company, but the data appears to be limited to customers who enrolled between July 2012 and July 2014.

  • Additionally, our customer base seems to be dominated by Baby Boomers and Generation X, with fewer Millennials and no information on Gen Z. The latter could be explained by the fact that data collection ended in 2014, when most of Gen Z was still too young to enroll.

Feature Engineering - Create new columns

# Derive aggregate features from the raw columns
marketing <- marketing %>%
  mutate(
    # total amount spent across all six product categories
    MntSpent = MntFishProducts + MntMeatProducts + MntFruits + MntSweetProducts + MntWines + MntGoldProds,
    # purchases summed over the catalog, store and web channels
    NumPurchases = NumCatalogPurchases + NumStorePurchases + NumWebPurchases,
    # children plus teenagers in the household
    MinorsHome = Kidhome + Teenhome,
    # how many of campaigns 1-5 the customer accepted
    AcceptedPrv = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5,
    # Age as of 2023. NOTE: this is NOT the age at enrollment; that would
    # require the year of Dt_Customer instead of the constant 2023.
    Age = as.numeric(2023 - Year_Birth)
  )

# Drop identifiers, columns folded into the engineered features, and the
# constant Z_* columns. Selecting by name (rather than by position) keeps this
# correct even if the column order changes upstream.
drop.cols <- c('ID', 'Year_Birth', 'Marital_Status', 'Kidhome', 'Teenhome',
               'Z_CostContact', 'Z_Revenue')
marketing <- marketing[, !(names(marketing) %in% drop.cols)]

# Order the remaining columns alphabetically and drop any incomplete rows
new_order <- sort(colnames(marketing))
marketing <- marketing[, new_order]
marketing <- na.omit(marketing)
head(marketing)
##   AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedPrv
## 1            0            0            0            0            0           0
## 2            0            0            0            0            0           0
## 3            0            0            0            0            0           0
## 4            0            0            0            0            0           0
## 5            0            0            0            0            0           0
## 6            0            0            0            0            0           0
##   Age Complain Dt_Customer  Education Income MinorsHome MntFishProducts
## 1  66        0  2012-09-04 Graduation  58138          0             172
## 2  69        0  2014-03-08 Graduation  46344          2               2
## 3  58        0  2013-08-21 Graduation  71613          0             111
## 4  39        0  2014-02-10 Graduation  26646          1              10
## 5  42        0  2014-01-19        PhD  58293          1              46
## 6  56        0  2013-09-09     Master  62513          1               0
##   MntFruits MntGoldProds MntMeatProducts MntSpent MntSweetProducts MntWines
## 1        88           88             546     1617               88      635
## 2         1            6               6       27                1       11
## 3        49           42             127      776               21      426
## 4         4            5              20       53                3       11
## 5        43           15             118      422               27      173
## 6        42           14              98      716               42      520
##   NumCatalogPurchases NumDealsPurchases NumPurchases NumStorePurchases
## 1                  10                 3           22                 4
## 2                   1                 2            4                 2
## 3                   2                 1           20                10
## 4                   0                 2            6                 4
## 5                   3                 5           14                 6
## 6                   4                 2           20                10
##   NumWebPurchases NumWebVisitsMonth Recency Rel_Status Response
## 1               8                 7      58     Single        1
## 2               1                 5      38     Single        0
## 3               8                 4      26    Coupled        0
## 4               2                 6      26    Coupled        0
## 5               5                 5      94    Coupled        0
## 6               6                 6      16    Coupled        0

EDA - Understand Data through Visualization after removing outliers

#comprehensive boxplots
# Columns excluded from the boxplot grid: campaign/complaint flags, the
# character columns, the date, the response and the accepted-campaign count
unwant.cols <- c(
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'Complain', 'Education', 'Rel_Status', 'Dt_Customer', 'Response', 'AcceptedPrv'
)
# Long format: one (variable, value) row per remaining cell
melt.marketing <- melt(select(marketing, -one_of(unwant.cols)))
## Using  as id variables
# One box plot per variable, each panel with its own axis ranges
melt.marketing %>%
    ggplot(aes(x = factor(variable), y = value)) +
    geom_boxplot(color = 'steelblue') +
    facet_wrap(~variable, scales = 'free') +
    labs(title = 'Boxplots of Various Variables', x = 'Variables', y = 'Ranges')

# Drop every row whose Age reaches the smallest value that the boxplot rule
# flags as an outlier
outliers <- boxplot(marketing$Age, plot = FALSE)$out
marketing <- filter(marketing, Age < min(outliers))
#list of products
products <- c('MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds')

#sum amounts spent on products and set these values in df
# (base colSums() replaces the deprecated summarize_each()/select(<vector>) chain)
product.totals <- colSums(marketing[products])
products.df <- data.frame(Products = names(product.totals),
                          Sums = unname(product.totals),
                          stringsAsFactors = FALSE)

#clean up structures
products <- gsub('Products', '', gsub(c('Mnt'), '', products))

# position_stack() stacks the fill groups in reverse level order, i.e. the last
# (alphabetically greatest) Products level starts at y = 0. The slice midpoints
# and their percentage labels must therefore be accumulated in that order, not
# in the row order of products.df, or the labels land on the wrong slices.
product.slice.order <- order(products.df$Products, decreasing = TRUE)
product.slice.sums <- products.df$Sums[product.slice.order]

#creating pie chart
ggplot(products.df, aes(x = '', y = Sums, fill = Products)) +
    geom_bar(stat = 'identity', width = 1, color = 'black') +
    geom_text(aes(label = paste('$', Sums)), color = 'white', position = position_stack(vjust = 0.5)) +
    coord_polar('y', start = 0) +
    labs(title = 'Percentage of Total Sales from Products', fill = 'Products', 
         caption = paste('Total Revenue: $', sum(products.df$Sums))) +
    scale_fill_discrete(labels = sort(products)) +
    theme(axis.ticks=element_blank(), axis.text.y=element_blank(), axis.text.x=element_text(colour='black'),
                axis.title=element_blank()) +
    scale_y_continuous(breaks = cumsum(product.slice.sums) - product.slice.sums / 2,
                       labels = paste(round(product.slice.sums / sum(product.slice.sums) * 100, 1), '%'))

# Purchase-count columns, one per sales channel
purchase <- c('NumCatalogPurchases', 'NumStorePurchases', 'NumWebPurchases')

# Total number of purchases per channel
# (base colSums() replaces the deprecated summarize_each()/select(<vector>) chain)
purchase.totals <- colSums(marketing[purchase])
purchase.df <- data.frame(Place = names(purchase.totals),
                          Sums = unname(purchase.totals),
                          stringsAsFactors = FALSE)

# Short channel names used as legend labels
purchase <- gsub('Purchases', '', gsub(c('Num'), '', purchase))

# position_stack() stacks the fill groups in reverse level order, i.e. the last
# (alphabetically greatest) Place level starts at y = 0. Accumulate the slice
# midpoints in that order so each percentage label sits on its own slice.
purchase.slice.order <- order(purchase.df$Place, decreasing = TRUE)
purchase.slice.sums <- purchase.df$Sums[purchase.slice.order]

ggplot(purchase.df, aes(x = '', y = Sums, fill = Place)) +
    geom_bar(stat = 'identity', width = 1, color = 'black') +
    geom_text(aes(label = paste(Sums)), color = 'white', position = position_stack(vjust = 0.5)) +
    coord_polar('y', start = 0) +
    labs(title = 'Percentage of Total Num of Purchases', fill = 'Places', 
         caption = paste('Total Num: ', sum(purchase.df$Sums))) +
    scale_fill_discrete(labels = sort(purchase)) +
    theme(axis.ticks=element_blank(), axis.text.y=element_blank(), axis.text.x=element_text(colour='black'),
                axis.title=element_blank()) +
    scale_y_continuous(breaks = cumsum(purchase.slice.sums) - purchase.slice.sums / 2,
                       labels = paste(round(purchase.slice.sums / sum(purchase.slice.sums) * 100, 1), '%'))

# Correlation map of the numeric columns: coefficients are printed as labels,
# and pairs with |r| > 0.6 get a translucent marker coloured by sign
Correlation_plot <- marketing %>%
    select(-one_of(unwant.cols)) %>%
    ggcorr(geom = 'blank', label = TRUE, hjust = 1.2, wjust = 1, layout.exp = 3) +
    geom_point(aes(color = coefficient > 0, alpha = abs(coefficient) > 0.6), size = 10) +
    scale_alpha_manual(values = c('TRUE' = 0.25, 'FALSE' = 0)) +
    guides(color = 'none', alpha = 'none') +
    labs(title = 'Correlation Map')

Correlation_plot

We can see the more interesting correlations in individual scatterplots:

# Income against total amount spent, with a fitted linear trend
marketing %>%
    ggplot(aes(x = MntSpent, y = Income)) +
    geom_point() +
    geom_smooth(method = lm) +
    labs(title = 'Income Against Amount Spent', x = 'Amount Spent ($)', y = 'Yearly Income ($)')
## `geom_smooth()` using formula = 'y ~ x'

# Income against monthly web visits, with a fitted linear trend.
# The x aesthetic and axis label are NumWebVisitsMonth, so the title now says
# so (it previously read 'Income Against Age').
ggplot(marketing, aes(x = NumWebVisitsMonth, y = Income)) +
    geom_point() +
    geom_smooth(method = lm) +
    labs(title = 'Income Against Web Visits per Month', x = '# of Web Visits per Month', y = 'Yearly Income ($)')
## `geom_smooth()` using formula = 'y ~ x'

#pie chart of complaints
# Frequency table of the Complain flag (columns `x` and `freq`)
complaint.counts <- count(marketing$Complain)
ggplot(complaint.counts, aes(x = '', y = freq, fill = as.character(x))) +
    geom_bar(stat = 'identity', width = 1) +
    coord_polar('y', start = 0) +
    labs(title = 'Share of Complaints', subtitle = 'In the last 2 Years') +
    # legend title corrected from the misspelled "Complant?"
    scale_fill_discrete(name = "Complaint?", labels = c("No", "Yes")) +
    theme_void()

# Income distribution by number of previously accepted campaigns
marketing %>%
    ggplot(aes(x = as.character(AcceptedPrv), y = Income)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Previously Accepted Campaigns')

# Income distribution by response to the last campaign
marketing %>%
    ggplot(aes(x = as.character(Response), y = Income)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Response') +
    ylab('Annual Income')

# Minors at home by number of previously accepted campaigns
marketing %>%
    ggplot(aes(x = as.character(AcceptedPrv), y = MinorsHome)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Previously Accepted Campaigns') +
    ylab('Kids at Home')

# Total amount spent by number of minors at home
marketing %>%
    ggplot(aes(x = as.character(MinorsHome), y = MntSpent)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Kids at Home') +
    ylab('Amount Spent')

# Age distribution by number of previously accepted campaigns
marketing %>%
    ggplot(aes(x = as.character(AcceptedPrv), y = Age)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Previously Accepted Campaigns') +
    ylab('Age')

# Recency distribution by response to the last campaign
marketing %>%
    ggplot(aes(x = as.character(Response), y = Recency)) +
    geom_boxplot(color = 'steelblue') +
    xlab('Response') +
    ylab('Recency')

# Customer counts per number of accepted campaigns, split by education level
marketing %>%
    ggplot(aes(x = as.character(AcceptedPrv), fill = Education)) +
    geom_bar(position = 'stack') +
    labs(x = 'Previously Accepted Campaigns', fill = 'Education')

# Chi-squared test of independence between the number of previously accepted
# campaigns and education level.
# NOTE(review): some cells of this table may have small expected counts, and
# any warning about that is hidden by options(warn = -1) — confirm.
chisq <- chisq.test(table(marketing$AcceptedPrv, marketing$Education))
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  table(marketing$AcceptedPrv, marketing$Education)
## X-squared = 12.976, df = 12, p-value = 0.3708
# Per-cell residuals of the test above, rounded to three decimals
round(chisq$residuals, 3)
##    
##      Basic Graduation Master    PhD
##   0  0.808      0.195  0.121 -0.675
##   1 -0.684     -0.352  0.079  0.698
##   2 -1.400     -1.015 -0.060  2.074
##   3 -1.050      0.687 -0.523 -0.240
##   4 -0.544      0.771 -0.703 -0.381
# Customer counts per number of previously accepted campaigns, split by the
# response to the last campaign
marketing %>%
    ggplot(aes(x = as.character(AcceptedPrv), fill = as.character(Response))) +
    geom_bar(position = 'stack') +
    labs(x = 'Previously Accepted Campaigns', fill = 'Response')

# Chi-squared test of independence between the number of previously accepted
# campaigns and the response to the last campaign (overwrites `chisq`)
chisq <- chisq.test(table(marketing$AcceptedPrv, marketing$Response))
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  table(marketing$AcceptedPrv, marketing$Response)
## X-squared = 351.6, df = 4, p-value < 2.2e-16
# Per-cell residuals of the test above, rounded to three decimals
round(chisq$residuals, 3)
##    
##          0      1
##   0  2.929 -6.872
##   1 -3.142  7.373
##   2 -3.025  7.098
##   3 -4.363 10.236
##   4 -2.723  6.390
#bar chart of most successful marketing campaign
cmps <- c('AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response')

# Column totals divided by the number of customers. Base colSums() replaces
# the deprecated summarize_each()/select(<vector>) chain. Note that `Percents`
# holds fractions of all customers (0-1 if the columns are 0/1 flags), not
# values on a 0-100 scale.
cmp.df <- data.frame(Campaigns = cmps,
                     Percents = unname(colSums(marketing[cmps])) / nrow(marketing),
                     stringsAsFactors = FALSE)

#bar plot, campaigns ordered by their share
ggplot(cmp.df, aes(y = reorder(Campaigns, Percents), x = Percents)) +
    geom_bar(stat = 'identity', fill = 'steelblue') +
    labs(x = 'Percentage', y = 'Campaigns')

# Column types going into the encoding step
str(marketing)
## 'data.frame':    2010 obs. of  28 variables:
##  $ AcceptedCmp1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp3       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedPrv        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Age                : num  66 69 58 39 42 56 52 38 49 73 ...
##  $ Complain           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dt_Customer        : Date, format: "2012-09-04" "2014-03-08" ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Income             : num  58138 46344 71613 26646 58293 ...
##  $ MinorsHome         : num  0 2 0 1 1 1 1 1 1 2 ...
##  $ MntFishProducts    : num  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntFruits          : num  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntGoldProds       : num  88 6 42 5 15 14 27 23 2 13 ...
##  $ MntMeatProducts    : num  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntSpent           : num  1617 27 776 53 422 ...
##  $ MntSweetProducts   : num  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntWines           : num  635 11 426 11 173 520 235 76 14 28 ...
##  $ NumCatalogPurchases: num  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumDealsPurchases  : num  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumPurchases       : num  22 4 20 6 14 20 17 8 5 1 ...
##  $ NumStorePurchases  : num  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebPurchases    : num  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumWebVisitsMonth  : num  7 5 4 6 5 6 6 8 9 20 ...
##  $ Recency            : num  58 38 26 26 94 16 34 32 19 68 ...
##  $ Rel_Status         : chr  "Single" "Single" "Coupled" "Coupled" ...
##  $ Response           : num  1 0 0 0 0 0 0 0 1 0 ...

Data Preprocessing - One-hot Encoding for Education/Rel_Status

# One-hot encode Education and Rel_Status. The `- 1` removes the intercept so
# every level gets its own indicator column; the indicators are appended to
# the original columns.
ohe_data <- cbind(marketing, model.matrix(~ Education - 1, data = marketing))
encoded_data <- model.matrix(~ Rel_Status - 1, data = ohe_data)
ohe_data <- cbind(ohe_data, encoded_data)
head(ohe_data)
##   AcceptedCmp1 AcceptedCmp2 AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedPrv
## 1            0            0            0            0            0           0
## 2            0            0            0            0            0           0
## 3            0            0            0            0            0           0
## 4            0            0            0            0            0           0
## 5            0            0            0            0            0           0
## 6            0            0            0            0            0           0
##   Age Complain Dt_Customer  Education Income MinorsHome MntFishProducts
## 1  66        0  2012-09-04 Graduation  58138          0             172
## 2  69        0  2014-03-08 Graduation  46344          2               2
## 3  58        0  2013-08-21 Graduation  71613          0             111
## 4  39        0  2014-02-10 Graduation  26646          1              10
## 5  42        0  2014-01-19        PhD  58293          1              46
## 6  56        0  2013-09-09     Master  62513          1               0
##   MntFruits MntGoldProds MntMeatProducts MntSpent MntSweetProducts MntWines
## 1        88           88             546     1617               88      635
## 2         1            6               6       27                1       11
## 3        49           42             127      776               21      426
## 4         4            5              20       53                3       11
## 5        43           15             118      422               27      173
## 6        42           14              98      716               42      520
##   NumCatalogPurchases NumDealsPurchases NumPurchases NumStorePurchases
## 1                  10                 3           22                 4
## 2                   1                 2            4                 2
## 3                   2                 1           20                10
## 4                   0                 2            6                 4
## 5                   3                 5           14                 6
## 6                   4                 2           20                10
##   NumWebPurchases NumWebVisitsMonth Recency Rel_Status Response EducationBasic
## 1               8                 7      58     Single        1              0
## 2               1                 5      38     Single        0              0
## 3               8                 4      26    Coupled        0              0
## 4               2                 6      26    Coupled        0              0
## 5               5                 5      94    Coupled        0              0
## 6               6                 6      16    Coupled        0              0
##   EducationGraduation EducationMaster EducationPhD Rel_StatusCoupled
## 1                   1               0            0                 0
## 2                   1               0            0                 0
## 3                   1               0            0                 1
## 4                   1               0            0                 1
## 5                   0               0            1                 1
## 6                   0               1            0                 1
##   Rel_StatusSingle
## 1                1
## 2                1
## 3                0
## 4                0
## 5                0
## 6                0
#ohe_data$Dt_Customer = as.factor(ohe_data$Dt_Customer)
# Drop three columns by position (10, 27 and 34 in the current column order),
# then store the response as a factor and inspect the resulting structure.
# NOTE(review): positional drops break silently if the column order changes.
# Per the head() output above, 27 and 34 look like Rel_Status and
# Rel_StatusSingle - confirm what column 10 is before switching to names.
ohe_data <- ohe_data[, -c(10, 27, 34)]
ohe_data$Response <- as.factor(ohe_data$Response)
str(ohe_data)
## 'data.frame':    2010 obs. of  31 variables:
##  $ AcceptedCmp1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp3       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedPrv        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Age                : num  66 69 58 39 42 56 52 38 49 73 ...
##  $ Complain           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dt_Customer        : Date, format: "2012-09-04" "2014-03-08" ...
##  $ Income             : num  58138 46344 71613 26646 58293 ...
##  $ MinorsHome         : num  0 2 0 1 1 1 1 1 1 2 ...
##  $ MntFishProducts    : num  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntFruits          : num  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntGoldProds       : num  88 6 42 5 15 14 27 23 2 13 ...
##  $ MntMeatProducts    : num  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntSpent           : num  1617 27 776 53 422 ...
##  $ MntSweetProducts   : num  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntWines           : num  635 11 426 11 173 520 235 76 14 28 ...
##  $ NumCatalogPurchases: num  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumDealsPurchases  : num  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumPurchases       : num  22 4 20 6 14 20 17 8 5 1 ...
##  $ NumStorePurchases  : num  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebPurchases    : num  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumWebVisitsMonth  : num  7 5 4 6 5 6 6 8 9 20 ...
##  $ Recency            : num  58 38 26 26 94 16 34 32 19 68 ...
##  $ Response           : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
##  $ EducationBasic     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ EducationGraduation: num  1 1 1 1 0 0 1 0 0 0 ...
##  $ EducationMaster    : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ EducationPhD       : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ Rel_StatusCoupled  : num  0 0 1 1 1 1 0 1 1 1 ...
# Build the binary target: number of accepted campaigns (AcceptedPrv plus
# Response), later collapsed to "none" vs "at least one".
# The sum is taken from `marketing` because ohe_data$Response is a factor.
# NOTE(review): assumes `marketing` and `ohe_data` rows are in the same order.
ohe_data$Campaigns_Accepted <- marketing$AcceptedPrv + marketing$Response

# Two bins: (-1, 0.5] is labelled 0, (0.5, 10] is labelled 1
cut_points <- c(-1, 0.5, 10)
labels <- c(0, 1)

# Replace the count with the two-level factor
ohe_data$Campaigns_Accepted <- cut(
  ohe_data$Campaigns_Accepted,
  breaks = cut_points,
  labels = labels
)
str(ohe_data)
## 'data.frame':    2010 obs. of  32 variables:
##  $ AcceptedCmp1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp3       : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedPrv        : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Age                : num  66 69 58 39 42 56 52 38 49 73 ...
##  $ Complain           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dt_Customer        : Date, format: "2012-09-04" "2014-03-08" ...
##  $ Income             : num  58138 46344 71613 26646 58293 ...
##  $ MinorsHome         : num  0 2 0 1 1 1 1 1 1 2 ...
##  $ MntFishProducts    : num  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntFruits          : num  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntGoldProds       : num  88 6 42 5 15 14 27 23 2 13 ...
##  $ MntMeatProducts    : num  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntSpent           : num  1617 27 776 53 422 ...
##  $ MntSweetProducts   : num  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntWines           : num  635 11 426 11 173 520 235 76 14 28 ...
##  $ NumCatalogPurchases: num  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumDealsPurchases  : num  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumPurchases       : num  22 4 20 6 14 20 17 8 5 1 ...
##  $ NumStorePurchases  : num  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebPurchases    : num  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumWebVisitsMonth  : num  7 5 4 6 5 6 6 8 9 20 ...
##  $ Recency            : num  58 38 26 26 94 16 34 32 19 68 ...
##  $ Response           : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
##  $ EducationBasic     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ EducationGraduation: num  1 1 1 1 0 0 1 0 0 0 ...
##  $ EducationMaster    : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ EducationPhD       : num  0 0 0 0 1 0 0 1 1 1 ...
##  $ Rel_StatusCoupled  : num  0 0 1 1 1 1 0 1 1 1 ...
##  $ Campaigns_Accepted : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 2 ...

Variable Selection - Random Forest

# Set the number of folds for cross-validation
k <- 5

# One slot per fold for each evaluation metric
accuracy <- numeric(k)
recall <- numeric(k)
precision <- numeric(k)
specificity <- numeric(k)

# Columns excluded from the model, by position in ohe_data. Per the
# str(ohe_data) output above these are AcceptedCmp1-5, AcceptedPrv,
# Dt_Customer and Response. AcceptedPrv and Response are the two terms summed
# into Campaigns_Accepted, so keeping them would leak the target.
excluded_cols <- c(1, 2, 3, 4, 5, 6, 9, 26)

# Integer fold boundaries. Computing the test range as
# ((i - 1) * n / k + 1):(i * n / k) produces fractional indices whenever n is
# not a multiple of k; setdiff() then does not remove those rows from the
# training set, so test rows leak into training. Flooring the boundaries
# keeps every index an integer and gives the same folds when n is divisible
# by k.
n_obs <- nrow(ohe_data)
stopifnot(n_obs >= k)
fold_bounds <- floor(seq(0, n_obs, length.out = k + 1))

# Perform k-fold cross-validation on contiguous (unshuffled) folds.
# NOTE(review): no seed is set before randomForest(), so metrics vary
# slightly between runs.
for (i in seq_len(k)) {
  # Create training and testing indices for the current fold
  test_indices <- (fold_bounds[i] + 1):fold_bounds[i + 1]
  train_indices <- setdiff(seq_len(n_obs), test_indices)

  # Subset the data into training and testing sets
  train_data <- ohe_data[train_indices, -excluded_cols]
  test_data <- ohe_data[test_indices, -excluded_cols]

  # Train the Random Forest model and plot its variable importance
  model <- randomForest(
    Campaigns_Accepted ~ .,
    data = train_data,
    importance = TRUE,
    ntree = 2000,
    mtry = 4
  )
  varImpPlot(model, main = "Variable Importance Plot", cex = 0.5)

  # Predict the class labels for the testing set (once per fold)
  predictions <- predict(model, newdata = test_data)

  # Confusion table of the current fold: rows = actual, columns = predicted.
  # It is not auto-printed inside a loop; call print(confusion) to see it.
  confusion <- table(test_data$Campaigns_Accepted, predictions)

  # Calculate and store the metrics for the current fold
  accuracy[i] <- sum(predictions == test_data$Campaigns_Accepted) /
    length(predictions)
  cm <- confusionMatrix(
    predictions,
    test_data$Campaigns_Accepted,
    positive = "1"
  )
  recall[i] <- cm$byClass["Sensitivity"]
  precision[i] <- cm$byClass["Precision"]
  specificity[i] <- cm$byClass["Specificity"]
}

# Average each metric across all folds
mean_accuracy <- mean(accuracy)
avg_recall <- mean(recall)
avg_precision <- mean(precision)
avg_specificity <- mean(specificity)


# Print the cross-validation results
print(paste('accuracy:',accuracy))
## [1] "accuracy: 0.823383084577114" "accuracy: 0.818407960199005"
## [3] "accuracy: 0.810945273631841" "accuracy: 0.843283582089552"
## [5] "accuracy: 0.781094527363184"
print(paste('recall:',recall))
## [1] "recall: 0.56"              "recall: 0.585106382978723"
## [3] "recall: 0.53030303030303"  "recall: 0.564814814814815"
## [5] "recall: 0.430894308943089"
print(paste('precision:',precision))
## [1] "precision: 0.674698795180723" "precision: 0.617977528089888"
## [3] "precision: 0.833333333333333" "precision: 0.792207792207792"
## [5] "precision: 0.746478873239437"
print(paste('specificity:',specificity))
## [1] "specificity: 0.910596026490066" "specificity: 0.88961038961039" 
## [3] "specificity: 0.948148148148148" "specificity: 0.945578231292517"
## [5] "specificity: 0.935483870967742"
print(paste('mean accuracy:',mean_accuracy))
## [1] "mean accuracy: 0.815422885572139"
print(paste('mean recall:',avg_recall))
## [1] "mean recall: 0.534223707407932"
print(paste('mean precision:',avg_precision))
## [1] "mean precision: 0.732939264410234"
print(paste('mean specificity:',avg_specificity))
## [1] "mean specificity: 0.925883333301773"

Customer Segmentation - K-means clustering

# K-means on every numeric column of `marketing`, each rescaled to [0, 1]
k <- 2  # Number of clusters
set.seed(101)  # Fixed seed so the clustering is reproducible

# Keep the numeric columns only
all_data.n <- marketing %>% dplyr::select(where(is.numeric))

# Min-max scaling, column by column: (x - min) / (max - min)
all_data.n <- as.data.frame(
  apply(all_data.n, 2, function(col) {
    col_range <- range(col)
    (col - col_range[1]) / (col_range[2] - col_range[1])
  })
)

kmeans_result_1 <- kmeans(all_data.n, centers = k)

# Plot the cluster assignments
clusplot(
  all_data.n,
  kmeans_result_1$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  main = "K-means Clustering for K=2 using all numeric variables"
)

all_data <- marketing

# Second clustering: only the six variables named in the plot title as having
# a high importance score, taken from the already min-max-scaled all_data.n
selected_cols <- c(
  "MntSpent", "MntWines", "MntMeatProducts",
  "Income", "MntGoldProds", "NumStorePurchases"
)
kmeans_data <- all_data.n[, selected_cols]

kmeans_result_2 <- kmeans(kmeans_data, centers = k)

# Keep the cluster labels for the plots further down
cluster_assignments <- kmeans_result_2$cluster
clusplot(
  kmeans_data,
  kmeans_result_2$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  main = "K-means Clustering for K=2 using variables with high importance score"
)

Assess the quality of clustering through silhouette width

# Silhouette widths for the clustering on all numeric variables, computed on
# the pairwise distances between the scaled observations.
# The result variable shares its name with the function, so the call is
# namespace-qualified to make clear which one is meant.
silhouette <- cluster::silhouette(kmeans_result_1$cluster, dist(all_data.n))

# Plot the silhouette widths per cluster
fviz_silhouette(silhouette)
##   cluster size ave.sil.width
## 1       1 1194          0.45
## 2       2  816          0.03

# Same diagnostic for the clustering on the selected variables
silhouette <- cluster::silhouette(kmeans_result_2$cluster, dist(kmeans_data))

# Plot the silhouette widths per cluster
fviz_silhouette(silhouette)
##   cluster size ave.sil.width
## 1       1 1151          0.69
## 2       2  859          0.30

Visualize the result of clustering

# Pair plot of the six selected variables, coloured by k-means cluster
plot_data <- marketing
plot_data$kmeans_cluster <- cluster_assignments

# Variables to plot, followed by the cluster label used for colouring
columns_to_plot <- c(
  "MntSpent", "MntWines", "MntMeatProducts",
  "Income", "MntGoldProds", "NumStorePurchases",
  "kmeans_cluster"
)

# Keep only those columns and make the cluster label a factor
data_subset <- plot_data[, columns_to_plot]
data_subset$kmeans_cluster <- as.factor(data_subset$kmeans_cluster)

# Draw the pair plots with reduced font sizes
ggpairs(
  data_subset,
  columns = seq_along(columns_to_plot),
  aes(color = kmeans_cluster)
) +
  theme(
    axis.text = element_text(size = 5),
    strip.text.x = element_text(size = 7),
    strip.text.y = element_text(size = 4)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Customers in Cluster 2 tend to:

  • spend more

  • shop more frequently in stores

  • have higher Income

# Stacked bar charts showing how the k-means clusters are distributed over
# (1) the number of previously accepted campaigns and (2) the response to the
# last campaign. The cluster label is converted to a factor once so ggplot
# uses a discrete fill for both charts.
plot_data$kmeans_cluster <- factor(plot_data$kmeans_cluster)

ggplot(plot_data, aes(x = as.character(AcceptedPrv), fill = kmeans_cluster)) +
  geom_bar(position = "stack") +
  labs(
    x = "Previously Accepted Campaigns",
    fill = "Cluster",
    title = "Distribution of Previously Accepted Campaigns Across Clusters"
  )

# This chart shows Response split by cluster, so the title names those two
# variables rather than previous campaign acceptance.
ggplot(plot_data, aes(x = as.character(Response), fill = kmeans_cluster)) +
  geom_bar(position = "stack") +
  labs(
    x = "if customer accepted the offer in the last campaign",
    fill = "Cluster",
    title = "Distribution of Last-Campaign Response Across Clusters"
  )

# Pair plot of the six selected variables, coloured by Campaigns_Accepted

# One colour per level of Campaigns_Accepted
my_colors <- c("pink", "#339CFF")

# Variables to plot, followed by the target used for colouring
columns_to_plot <- c(
  "MntSpent", "MntWines", "MntMeatProducts",
  "Income", "MntGoldProds", "NumStorePurchases",
  "Campaigns_Accepted"
)

# Keep only those columns and make sure the target is a factor
data_subset <- ohe_data[, columns_to_plot]
data_subset$Campaigns_Accepted <- as.factor(data_subset$Campaigns_Accepted)

# Draw the pair plots with reduced font sizes and the custom palette
ggpairs(
  data_subset,
  columns = seq_along(columns_to_plot),
  aes(color = Campaigns_Accepted)
) +
  theme(
    axis.text = element_text(size = 6),
    strip.text.x = element_text(size = 7),
    strip.text.y = element_text(size = 4)
  ) +
  scale_color_manual(values = my_colors) +
  scale_fill_manual(values = my_colors)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Validate the association between

Campaigns_Accepted and the variables with high importance score

# Bin MntSpent into two levels and test its association with
# Campaigns_Accepted (chi-squared test on the resulting 2x2 table).
data <- marketing
data$Campaigns_Accepted <- ohe_data$Campaigns_Accepted

# Bins: [0, 800] -> "Low", (800, 3000] -> "High"
cut_points <- c(0, 800, 3000)
labels <- c("Low", "High")

# include.lowest = TRUE closes the first bin on the left. Without it a value
# of exactly 0 falls outside (0, 800], becomes NA and is silently left out of
# the contingency table.
# NOTE(review): values above the top break still become NA - confirm the
# maximum of MntSpent is below 3000.
data$MntSpent <- cut(
  data$MntSpent,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$MntSpent, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$MntSpent, data$Campaigns_Accepted)
## X-squared = 190.41, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntSpent, data$Campaigns_Accepted))
effect_size
##                     X^2 df P(> X^2)
## Likelihood Ratio 186.44  1        0
## Pearson          191.86  1        0
## 
## Phi-Coefficient   : 0.309 
## Contingency Coeff.: 0.295 
## Cramer's V        : 0.309
round(chisq$residuals, 3)
##       
##             0      1
##   Low   4.318 -6.975
##   High -5.875  9.489
corrplot(chisq$residuals, is.cor = FALSE)

# Bin MntWines into two levels and test its association with
# Campaigns_Accepted. `data` is rebuilt from `marketing` here, which restores
# any column binned earlier to its original numeric values.
data <- marketing
data$Campaigns_Accepted <- ohe_data$Campaigns_Accepted

# Bins: [0, 500] -> "Low", (500, 3000] -> "High"
cut_points <- c(0, 500, 3000)
labels <- c("Low", "High")

# include.lowest = TRUE keeps customers with a value of exactly 0 in the
# "Low" bin; with the default they become NA and drop out of the table.
data$MntWines <- cut(
  data$MntWines,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$MntWines, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$MntWines, data$Campaigns_Accepted)
## X-squared = 251.54, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntWines, data$Campaigns_Accepted))
effect_size
##                     X^2 df P(> X^2)
## Likelihood Ratio 237.57  1        0
## Pearson          253.34  1        0
## 
## Phi-Coefficient   : 0.356 
## Contingency Coeff.: 0.335 
## Cramer's V        : 0.356
round(chisq$residuals, 3)
##       
##             0      1
##   Low   4.318 -6.961
##   High -7.194 11.597
corrplot(chisq$residuals, is.cor = FALSE)

# Bin MntMeatProducts into two levels and test its association with
# Campaigns_Accepted.
# Bins: [0, 300] -> "Low", (300, 3000] -> "High"
cut_points <- c(0, 300, 3000)
labels <- c("Low", "High")

# include.lowest = TRUE keeps customers with a value of exactly 0 in the
# "Low" bin; with the default they become NA and drop out of the table.
data$MntMeatProducts <- cut(
  data$MntMeatProducts,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$MntMeatProducts, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$MntMeatProducts, data$Campaigns_Accepted)
## X-squared = 128.35, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$MntMeatProducts, data$Campaigns_Accepted))
effect_size
##                     X^2 df P(> X^2)
## Likelihood Ratio 119.94  1        0
## Pearson          129.76  1        0
## 
## Phi-Coefficient   : 0.254 
## Contingency Coeff.: 0.246 
## Cramer's V        : 0.254
round(chisq$residuals, 3)
##       
##             0      1
##   Low   2.702 -4.363
##   High -5.353  8.646
corrplot(chisq$residuals, is.cor = FALSE)

# Bin MntGoldProds into two levels and test its association with
# Campaigns_Accepted.
# Bins: [0, 80] -> "Low", (80, 1000] -> "High"
cut_points <- c(0, 80, 1000)
labels <- c("Low", "High")

# include.lowest = TRUE keeps customers with a value of exactly 0 in the
# "Low" bin; with the default they become NA and drop out of the table.
data$MntGoldProds <- cut(
  data$MntGoldProds,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$MntGoldProds, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$MntGoldProds, data$Campaigns_Accepted)
## X-squared = 24.114, df = 1, p-value = 9.08e-07
effect_size <- assocstats(table(data$MntGoldProds, data$Campaigns_Accepted))
effect_size
##                     X^2 df   P(> X^2)
## Likelihood Ratio 23.561  1 1.2101e-06
## Pearson          24.761  1 6.4913e-07
## 
## Phi-Coefficient   : 0.113 
## Contingency Coeff.: 0.112 
## Cramer's V        : 0.113
round(chisq$residuals, 3)
##       
##             0      1
##   Low   1.121 -1.814
##   High -2.364  3.825
corrplot(chisq$residuals, is.cor = FALSE)

# Bin Income into two levels and test its association with
# Campaigns_Accepted.
# Bins: [0, 50000] -> "Low", (50000, 200000] -> "High"
cut_points <- c(0, 50000, 200000)
labels <- c("Low", "High")

# include.lowest = TRUE keeps a value of exactly 0 in the "Low" bin instead
# of turning it into NA, matching the other binning steps.
# NOTE(review): incomes above 200000 still become NA - confirm none remain.
data$Income <- cut(
  data$Income,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$Income, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$Income, data$Campaigns_Accepted)
## X-squared = 82.409, df = 1, p-value < 2.2e-16
effect_size <- assocstats(table(data$Income, data$Campaigns_Accepted))
effect_size
##                     X^2 df P(> X^2)
## Likelihood Ratio 85.236  1        0
## Pearson          83.318  1        0
## 
## Phi-Coefficient   : 0.204 
## Contingency Coeff.: 0.2 
## Cramer's V        : 0.204
round(chisq$residuals, 3)
##       
##             0      1
##   Low   3.494 -5.644
##   High -3.298  5.327
corrplot(chisq$residuals, is.cor = FALSE)

# Bin NumStorePurchases into two levels and test its association with
# Campaigns_Accepted.
# Bins: [0, 5] -> "Low", (5, 100] -> "High"
cut_points <- c(0, 5, 100)
labels <- c("Low", "High")

# include.lowest = TRUE keeps customers with exactly 0 store purchases in the
# "Low" bin; with the default they become NA and drop out of the table.
data$NumStorePurchases <- cut(
  data$NumStorePurchases,
  breaks = cut_points,
  labels = labels,
  include.lowest = TRUE
)

chisq <- chisq.test(table(data$NumStorePurchases, data$Campaigns_Accepted))
chisq
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$NumStorePurchases, data$Campaigns_Accepted)
## X-squared = 61.444, df = 1, p-value = 4.554e-15
effect_size <- assocstats(table(data$NumStorePurchases, data$Campaigns_Accepted))
effect_size
##                     X^2 df   P(> X^2)
## Likelihood Ratio 62.006  1 3.4417e-15
## Pearson          62.235  1 2.9976e-15
## 
## Phi-Coefficient   : 0.177 
## Contingency Coeff.: 0.174 
## Cramer's V        : 0.177
round(chisq$residuals, 3)
##       
##             0      1
##   Low   2.761 -4.444
##   High -3.116  5.015
corrplot(chisq$residuals, is.cor = FALSE)